1 Introduction

Loading Data

The following libraries will be used.

library(data.table) # database-like tables
library(ggplot2) # general plots
library(highcharter) # interactive plots
library(DT) # display tables
library(corrplot) # corr SPLOMs
library(vcd) # stats for categories
library(mice) # multiple imputation
library(Boruta) # estimate variable importance

Input is loaded and covnerted to data.table.

test <- read.csv("../input/test.csv")
train <- read.csv("../input/train.csv")
test <- data.table(test)
train <- data.table(train)

2 Data Quality

The number of dimensions and sampling density is calculated. The datasets are combined.

(N <- nrow(train))
## [1] 1460
(p <- ncol(train) - 2)
## [1] 79
N^(1/p)
## [1] 1.096617
test$SalePrice <- NaN
full <- rbind(train, test)
full$label <- rep(c("train", "test"), c(nrow(train), nrow(test)))

2.1 Feature Attributes

From the data_description variable types can be defined. There are 4 categories: nominal, ordinal, discrete, continuous. A lot of the discrete variables could arguably be handled as ordinals.

nomVars <- c("MSZoning", "Street", "Alley", "LandContour", "LotConfig",
             "Neighborhood", "Condition1", "Condition2", "HouseStyle",
             "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
             "MasVnrType", "Foundation", "Heating", "CentralAir", "Electrical",
             "GarageType", "PavedDrive", "MiscFeature", "SaleType",
             "SaleCondition")
ordVars <- c("MSSubClass", "LotShape", "Utilities", "LandSlope", "BldgType",
             "OverallQual", "OverallCond", "ExterQual", "ExterCond", "BsmtQual",
             "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
             "HeatingQC", "KitchenQual", "Functional", "FireplaceQu",
             "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence",
             "MoSold")
discVars <- c("YearBuilt", "YearRemodAdd", "BsmtFullBath", "BsmtHalfBath",
              "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
              "TotRmsAbvGrd", "Fireplaces", "GarageYrBlt", "GarageCars",
              "YrSold")
contVars <- c("LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
              "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF",
              "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF",
              "EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea",
              "MiscVal", "LotFrontage")

A table is created to store feature attributes.

featAttr <- data.table(
  name = colnames(test)[-1],
  type = "nominal"
)
featAttr[name %in% ordVars, type := "ordinal"]
featAttr[name %in% discVars, type := "discrete"]
featAttr[name %in% contVars, type := "continuous"]

Special Values

According to data_description several \(NA\) have a special meaning. These values are replaced with more descriptive chr strings. Before, test and train set are combined.

full[is.na(Alley), Alley := "noAccess"]
full[is.na(BsmtQual), BsmtQual := "noBasement"]
full[is.na(BsmtCond), BsmtCond := "noBasement"]
full[is.na(BsmtExposure), BsmtExposure := "noBasement"]
full[is.na(BsmtFinType1), BsmtFinType1 := "noBasement"]
full[is.na(BsmtFinType2), BsmtFinType2 := "noBasement"]
full[is.na(FireplaceQu), FireplaceQu := "noFireplace"]
full[is.na(GarageType), GarageType := "noGarage"]
full[is.na(GarageFinish), GarageFinish := "noGarage"]
full[is.na(GarageQual), GarageQual := "noGarage"]
full[is.na(GarageCond), GarageCond := "noGarage"]
full[is.na(PoolQC) , PoolQC := "noPool"]
full[is.na(Fence), Fence := "noFence"]
full[is.na(MiscFeature), MiscFeature := "none"]

Nominal, and ordinal variables are converted to factor, continuous to numeric. Discrete variables contain years and measures like number of cars so they will be converted to integer.

for (j in c(nomVars, ordVars)) full[[j]] <- as.factor(full[[j]])
for (j in contVars) full[[j]] <- as.numeric(full[[j]])
for (j in discVars) full[[j]] <- as.integer(full[[j]])

Ordering

From data_description the order of ordinal variable classes can be derived. These orderings are defined, then the levels of their factors are adjusted.

lvlOrd <- list(
  LotShape = c("Reg", "IR1", "IR2", "IR3"),
  Utilities = c("AllPub", "NoSewr", "NoSeWa", "ELO"),
  LandSlope = c("Gtl", "Mod", "Sev"),
  BldgType = c("1Fam", "2FmCon", "Duplx", "TwnhsE", "TwnhsI"),
  OverallQual = 10:1,
  OverallCond = 10:1,
  ExterQual = c("Ex", "Gd", "TA", "Fa", "Po"),
  ExterCond = c("Ex", "Gd", "TA", "Fa", "Po"),
  BsmtQual = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
  BsmtCond = c("Ex", "Gd", "TA", "Fa", "Po", "noBasement"),
  BsmtExposure = c("Gd", "Av", "Mn", "No", "noBasement"),
  BsmtFinType1 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
  BsmtFinType2 = c("GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "noBasement"),
  HeatingQC = c("Ex", "Gd", "TA", "Fa", "Po"),
  KitchenQual = c("Ex", "Gd", "TA", "Fa", "Po"),
  Functional = c("Typ", "Min1", "Min2", "Mod", "Maj1", "Maj2", "Sev", "Sal"),
  FireplaceQu = c("Ex", "Gd", "TA", "Fa", "Po", "noFireplace"),
  GarageFinish = c("Fin", "RFn", "Unf", "noGarage"),
  GarageQual = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
  GarageCond = c("Ex", "Gd", "TA", "Fa", "Po", "noGarage"),
  PoolQC = c("Ex", "Gd", "TA", "Fa", "noPool"),
  Fence = c("GdPrv", "MnPrv", "GdWo", "MnWw", "noFence"),
  MoSold = 1:12
)
for (j in names(lvlOrd)) {
  full[[j]] <- factor(full[[j]], levels = lvlOrd[[j]])
}

2.2 Feature Dependencies

Some features’ values heavily depend on the value of another feature. For example the values of \(BsmtUnfSF\), \(TotalBsmtSF\) only make sense if features \(BsmtCond\), \(BsmtExposure\), \(BsmtQual\) are not \(noBasement\). This information is given in data_description.

\[ \begin{aligned} BsmtUnfSF, TotalBsmtSF : BsmtCond, BsmtExposure, BsmtQual \ne noBasement\\ BsmtFinSF1 : BsmtFinType1 \ne noBasement\\ BsmtFinSF2 : BsmtFinType2 \ne noBasement \\ GarageYrBlt, GarageCars, GarageArea: GarageType, GarageFinish, GarageQual, GarageCond \ne noGarage \\ PoolArea : PoolQC \ne noPool \\ MiscVal : MiscFeature \ne none \\ MasVnrArea : MasVnrType \ne none \end{aligned} \] For the garage and basement features there are several features which indicate the absence of that feature. If one of them indicates the absence of a feature, the others should too. This can be tested with the difference between two sets.

outersect <- function(x, y) sort(c(setdiff(x, y), setdiff(y, x)))

The differences between Ids of observations with noBasement are computed.

outersect(which(full$BsmtCond == "noBasement"),
          which(full$BsmtExposure == "noBasement"))
## [1]  949 1488 2041 2186 2349 2525
outersect(which(full$BsmtCond == "noBasement"),
          which(full$BsmtQual == "noBasement"))
## [1] 2041 2186 2218 2219 2525
full[c(949, 1488, 2041, 2186, 2218, 2219, 2349, 2525), 
     .(Id, BsmtCond, BsmtExposure, BsmtQual, BsmtUnfSF, TotalBsmtSF)]
##      Id   BsmtCond BsmtExposure   BsmtQual BsmtUnfSF TotalBsmtSF
## 1:  949         TA   noBasement         Gd       936         936
## 2: 1488         TA   noBasement         Gd      1595        1595
## 3: 2041 noBasement           Mn         Gd         0        1426
## 4: 2186 noBasement           No         TA        94        1127
## 5: 2218         Fa           No noBasement       173         173
## 6: 2219         TA           No noBasement       356         356
## 7: 2349         TA   noBasement         Gd       725         725
## 8: 2525 noBasement           Av         TA       240         995

So here the data_description is not consistent. The general condition is set to noBasement but there are values for basement exposure, quality, and so on. noBasement was translated from NAs. Since for all the other basement related variables there are reasonable values it might be that in these cases NA did not mean noBasement.

The same can be done for the garage related variables.

outersect(which(full$GarageType == "noGarage"),
          which(full$GarageFinish == "noGarage"))
## [1] 2127 2577
outersect(which(full$GarageType == "noGarage"),
          which(full$GarageQual == "noGarage"))
## [1] 2127 2577
outersect(which(full$GarageType == "noGarage"),
          which(full$GarageCond == "noGarage"))
## [1] 2127 2577
full[c(2127, 2577), 
     .(GarageYrBlt, GarageCars, GarageArea, GarageType, 
       GarageFinish, GarageQual, GarageCond)]
##    GarageYrBlt GarageCars GarageArea GarageType GarageFinish GarageQual
## 1:          NA          1        360     Detchd     noGarage   noGarage
## 2:          NA         NA         NA     Detchd     noGarage   noGarage
##    GarageCond
## 1:   noGarage
## 2:   noGarage

Here, most garage related features were missing.

Mark Undefined Variables

If e.g. PoolQC is noPool, then there should be a missing vlaue or a zero for PoolArea. If there actually is a value, this could either mean it is a mistake or it means something special that is not clearly described. To at least be consistent, all NAs will be set to 0.

full[BsmtCond == "noBasement" & is.na(BsmtUnfSF), BsmtUnfSF := 0]
full[BsmtCond == "noBasement" & is.na(TotalBsmtSF), TotalBsmtSF := 0]
full[BsmtFinType1 == "noBasement" & is.na(BsmtFinSF1), BsmtFinSF1 := 0]
full[BsmtFinType2 == "noBasement" & is.na(BsmtFinSF2), BsmtFinSF2 := 0]
full[GarageType == "noGarage" & is.na(GarageYrBlt), GarageYrBlt := 0]
full[GarageType == "noGarage" & is.na(GarageCars), GarageCars := 0]
full[GarageType == "noGarage" & is.na(GarageArea), GarageArea := 0]
full[PoolQC == "noPool" & is.na(PoolArea), PoolArea := 0]
full[MiscFeature == "noFeature" & is.na(MiscVal), MiscVal := 0]

2.3 Missing Data

\(NA\)s which are left must be true missing data.

naFeats <- apply(full[, !"SalePrice"], 2, function(x) any(is.na(x)))
naObs <- apply(full[, !"SalePrice"], 1, function(x) any(is.na(x)))

Features

The total number of missing values per feature are shown below.

total <- apply(full[, naFeats, with = FALSE], 2, function(j) sum(is.na(j)))
df <- data.frame(
  missing = total / nrow(full),
  name = factor(names(total), levels = names(total)[order(total)]),
  type = featAttr[match(names(total), featAttr$name), type]
)
ggplot(df, aes(x = name, y = missing, fill = type)) +
  geom_bar(stat = "identity") +
  scale_y_continuous("proportion missing") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Features with Missing Values")

Observations

Below the number of missing values for observations that have missing values is shown. Many observations have 1 or 2, some have 3 or 4 missing values.

total <- apply(full[naObs, ], 1, function(i) sum(is.na(i)))
df <- data.frame(
  y = total,
  name = full[naObs, Id]
)
highchart() %>%
    hc_chart(type = "column") %>%
    hc_title(text = "Observations with Missing Values") %>%
    hc_xAxis(title = list(text = "observations"), categories = df$name, 
             labels  = list(rotation = "-45")) %>%
    hc_yAxis(title = list(text = "missing"), align = "left") %>%
    hc_add_series(showInLegend = FALSE, data = list_parse(df[order(df$y), ]))

3 Feature Exploration

3.1 Feature Distributions

Feature distributions are described for each type of variable separately.

3.1.1 Continuous Variables

Continuous variables are very much skewed towards zero. Partly because there are many zeros, partly because they are log-normal distributed. Here, they are log10 transformed to better capture the spread. (log10 because it’s nicer to interpret than log) A new table full2 will hold the transformed values.

full2 <- full
for (j in contVars) {
  full2[[j]] <- log10(full[[j]] + 1)
}

Their distributions are shown below.

dt <- melt(full2, "Id", contVars)
ggplot(dt, aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ variable, scale = "free") +
  theme_bw()

Some distributions are not visible because of all the zeros. The next plot shows the same, excluding zeros.

dt <- melt(full, "Id", contVars)
ggplot(dt, aes(x = value)) +
  geom_histogram(bins = 30) +
  scale_x_continuous(trans = "log10") +
  facet_wrap(~ variable, scale = "free") +
  theme_bw()

Summaries for each distribution are computed. There are many undefined variables which are set to zero. This influences the description a lot and is not of particular interest. Therefore, they are excluded.

dt <- full2[, contVars, with = FALSE]
df <- data.frame(
  Min = apply(dt, 2, function(x) min(x[x > 0], na.rm = TRUE)),
  Mean = apply(dt, 2, function(x) mean(x[x > 0], na.rm = TRUE)),
  Median = apply(dt, 2, function(x) median(x[x > 0], na.rm = TRUE)),
  Max = apply(dt, 2, function(x) max(x[x > 0], na.rm = TRUE))
)

Relative difference between mean and median (relDelta) are computed as an indicator for skewness. Also, relative cardinality and relative number of outliers are computed. Outliers are defined as being differing at least \(+/- 1.5 MAD\) from the median.

df$relDelta <- abs(df$Mean - df$Median) / df$Mean
df$relCard <- apply(dt, 2, function(x) {
  length(unique(x[x > 0])) / length(x[x > 0])
})
df$relOut <- apply(dt, 2, function(x) {
  length(boxplot.stats(x[x > 0])$out) / length(x[x > 0])
})

Below is a summary table. There are some outliers for PoolArea but this is because most values in this variable are zero.

DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>% 
  formatStyle('relDelta', 
    color = styleInterval(c(0.2, 0.8), c('green', 'orange', 'red'))) %>%
  formatStyle('relCard', 
    color = styleInterval(c(0.2, 0.6), c('red', 'orange', 'green'))) %>%
  formatStyle('relOut', 
    color = styleInterval(c(0.05, 0.1), c('green', 'orange', 'red'))) %>%
  formatRound(1:7, 3)

3.1.2 Discrete Variables

Discrete variables contain years and numerical measures. They are treated as continuous here.

dt <- melt(full, "Id", discVars)
ggplot(dt, aes(x = value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ variable, scale = "free") +
  theme_bw()

A summary, as above is computed.

dt <- full[, discVars, with = FALSE]
df <- data.frame(
  Min = apply(dt, 2, function(x) min(x[x > 0], na.rm = TRUE)),
  Mean = apply(dt, 2, function(x) mean(x[x > 0], na.rm = TRUE)),
  Median = apply(dt, 2, function(x) median(x[x > 0], na.rm = TRUE)),
  Max = apply(dt, 2, function(x) max(x[x > 0], na.rm = TRUE))
)

Skewness and cardinality would be less interesting here. Instead the modes will be extracted.

Mode <- function(x, fst = TRUE) {
  ux <- unique(x)
  tab <- tabulate(match(x, ux))
  modeIdx <- which.max(tab)
  if (fst) return(list(value = ux[modeIdx], freq = tab[modeIdx]))
  tab[modeIdx] <- 0
  modeIdx <- which.max(tab)
  return(list(value = ux[modeIdx], freq = tab[modeIdx]))
}

First and second modes are computed and the table is shown. Modes that make up more than 50% of observations will be highlighted.

df$Mode <- apply(dt, 2, function(x) Mode(x)$value)
df$ModeCount <- apply(dt, 2, function(x) Mode(x)$freq)
df$Mode2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$value)
df$ModeCount2nd <- apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>% 
  formatStyle('ModeCount', 
    color = styleInterval(nrow(dt) / 2, c("black", 'orange'))) %>%
  formatRound(2, 0)

In GarageYrBlt the maximum value is 2207, which does not make sense. It seems that this is a typo and the real value is 2007. After checking that there are not more values above 2010 2207 is changed to 2007.

full[GarageYrBlt == 2207, GarageYrBlt := 2007]

3.1.3 Categorical Variables

Ordinal Variables

dt <- melt(full, "Id", ordVars)
ggplot(dt, aes(x = value)) +
  geom_bar() +
  facet_wrap(~ variable, scales = "free") +
  theme_bw()

Nominal Variables

dt <- melt(full, "Id", nomVars)
ggplot(dt, aes(x = value)) +
  geom_bar() +
  facet_wrap(~ variable, scales = "free") +
  theme_bw()

Summary

Cardinality, 1st and 2nd modes are given as summary. Modes that make up more than 50% of observations will be highlighted.

dt <- full[, c(ordVars, nomVars), with = FALSE]
df <- data.frame(
  Card = apply(dt, 2, function(x) length(unique(x[!is.na(x)]))),
  Mode = apply(dt, 2, function(x) Mode(x)$value),
  ModeCount = apply(dt, 2, function(x) Mode(x)$freq),
  Mode2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$value),
  ModeCount2nd = apply(dt, 2, function(x) Mode(x, fst = FALSE)$freq),
  Type = rep(c("ord", "nom"), c(length(ordVars), length(nomVars)))
)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>% 
  formatStyle('Type', 
    color = styleEqual(unique(df$Type), c('orange', 'blue'))) %>% 
  formatStyle('ModeCount', 
    color = styleInterval(nrow(dt) / 2, c("black", 'orange')))

3.2 Correlations between Features

3.2.1 Continuous and Discrete

vars <- c(contVars, discVars)

The Pearson Correlation Coefficient is calculated for all feature pairs where both features are continuous. The results are shown as heatmap below. Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link). Here, the log10 transformed values are used.

m <- cor(
  full2[, vars, with = FALSE],
  use = "complete.obs"
)
corrplot(m, method = "color", order = "hclust")

Some clusters are visible. The same hierarchical clustering is shown as tree below.

corrClust <- hclust(dist(m))
plot(corrClust)

3.2.2 Categorical

vars <- c(ordVars, nomVars)

For categorical features Cramer’s V is calculated for all feature pairs to indicate correlation. The results are shown as heatmap below.

m <- as.matrix(full[, vars, with = FALSE])
m <- m[complete.cases(m), ]
mCramer <- matrix(NA, ncol(m), ncol(m))
for (i in seq_len(ncol(m))) {
  mCramer[i, ] <- vapply(
    seq_len(ncol(m)), 
    function(j) assocstats(table(m[, i], m[, j]))$cramer, 
    numeric(1)
  )
}

Using euclidean distance of the correlation matrix as a metric rows and columns were hierarchically ordered (complete link).

colnames(mCramer) <- colnames(m)
rownames(mCramer) <- colnames(m)
corrplot(mCramer, method = "color", order = "hclust")

The tree for the clustering:

corrClust <- hclust(dist(mCramer))
plot(corrClust)

3.2.3 Continuous, Discrete, Ordinal

Ordinal variable levels were ordered before. Here, they are converted to integer and then treated as continuous.

for (j in ordVars) {
  full2[[j]] <- as.integer(full2[[j]])
}

Except for 1 non-missing observation Utilities consists of only one value with NoSeWa. The rest is AllPub. Correlation cannot be calculated with this. It will be therefore be excldued here.

vars <- c(contVars, discVars, ordVars)
vars <- vars[!vars == "Utilities"]

Now they are compared to continuous and discrete variables.

m <- cor(
  full2[, vars, with = FALSE],
  use = "complete.obs"
)
corrplot(m, method = "color", order = "hclust")

corrClust <- hclust(dist(m))
plot(corrClust)


4 Imputation

4.1 Single Imputation

Single missing values in basement related features, where all other basement related features did have a value, are replaced by their mode.

full[c(949, 1488, 2349), BsmtExposure := "No"]
full[c(2041, 2186, 2525), BsmtCond := "TA"]
full[c(2218, 2219), BsmtQual := "TA"]

For garage related features there are two cases with almost no non-missing value for garage. For these two cases the garage features are set to noGarage.

full[c(2127, 2577), GarageYrBlt := 0]
full[c(2127, 2577), GarageCars := 0]
full[c(2127, 2577), GarageArea := 0]
full[c(2127, 2577), GarageType := "noGarage"]

Features with only very few missing values will be imputed using the mode for categories and the median for continuous variables.

full[is.na(Exterior1st), Exterior1st := "VinylSd"]
full[is.na(Exterior2nd), Exterior2nd := "VinylSd"]
full[is.na(Electrical), Electrical := "SBrkr"]
full[is.na(KitchenQual), KitchenQual := "TA"]
full[is.na(SaleType), SaleType := "WD"]
full[is.na(Utilities), Utilities := "AllPub"]
full[is.na(Functional), Functional := "Typ"]
full[is.na(MSZoning), MSZoning := "RL"]
full[is.na(MasVnrArea), MasVnrArea := 202]
full[is.na(MasVnrType), MasVnrType := "none"]

There were two observations with NA values for basement bath related variables in which all other basement related variables were set to noBasement. These will be set to zreo.

full[is.na(BsmtFullBath), BsmtFullBath := 0]
full[is.na(BsmtHalfBath), BsmtHalfBath := 0]

4.2 Multiple Imputation

BldgType and LotFrontage have around 10-15% missing values. These will be imputed by Gibbs sampling of the posterior distribution. Variables with missing values are repeatedly predicted given a set of predictor variables. For LotFrontage predictive mean matching, and for BldgType polytomous regression will be used. As predictors all continuous and discrete variables, and the the 2 categorical variables which are most correlated to BldgType are included. Several datasets are created by this method but only one will be used.

vars <- c(contVars, discVars, "BldgType", "MSSubClass", "HouseStyle")

The transformed values will be used.

full2 <- full
for (j in contVars) {
  full2[[j]] <- log10(full2[[j]] + 1)
}
dt <- full2[, vars, with = FALSE]
imp <- mice(dt, seed = 42)

LotFrontage

The distribution of LotFrontage after imputation is compared to the original distribution.

df <- data.frame(
  LotFrontage = c(dt$LotFrontage, complete(imp)$LotFrontage),
  Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = LotFrontage, color = Distro)) +
  geom_density() +
  theme_bw()

Both distributions look very similar. Furthermore, its correlation with the 4 most correlated continuous variables is compared.

df <- data.frame(
  Value = c(dt$LotArea, dt$X1stFlrSF, dt$GrLivArea, dt$MasVnrArea),
  Variable = rep(c("LotArea", "X1stFlrSF", "GrLivArea", "MasVnrArea"), 
                 each = nrow(dt)),
  LotFrontage = rep(complete(imp)$LotFrontage, each = 4),
  isImputed = factor(is.na(dt$LotFrontage), levels = c(TRUE, FALSE))
)
ggplot(df, aes(x = LotFrontage, y = Value, color = isImputed))  +
  geom_point(size = .7) +
  facet_wrap( ~ Variable, scales = "free") +
  theme_bw()

Imputed values seem to resemble the shape of the original data.

BldgType

Imputed and original distributions are compared.

df <- data.frame(
  BldgType = c(as.character(dt$BldgType), as.character(complete(imp)$BldgType)),
  Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = BldgType, fill = Distro)) +
  geom_bar(position = "dodge") +
  scale_x_discrete() +
  theme_bw()

Next correlation to the closest correlated categorical feature MSSubClass before and after imputation is checked.

df <- data.frame(
  BldgType = c(as.character(dt$BldgType), as.character(complete(imp)$BldgType)),
  MSSubClass = rep(dt$MSSubClass, 2),
  Distro = rep(c("Original", "Imputed"), each = nrow(dt))
)
ggplot(df, aes(x = MSSubClass, fill = BldgType)) +
  geom_bar(position = "dodge") +
  facet_grid(Distro ~ .) +
  theme_bw()

Both

Finally, the correlation of both imputed variables is checked.

df <- data.frame(
  LotFrontage = rep(c(dt$LotFrontage, complete(imp)$LotFrontage), each = 2),
  BldgType = rep(c(as.character(dt$BldgType), 
                   as.character(complete(imp)$BldgType)), each = 2),
  Distro = rep(c("Original", "Imputed"), each = 2*nrow(dt))
)
ggplot(df, aes(x = LotFrontage, color = BldgType)) +
  geom_density() +
  facet_grid(Distro ~ .) +
  theme_bw()
## Warning: Removed 972 rows containing non-finite values (stat_density).

The distribution of \(BldgType = TwnhsE\) changed visibly. However, with less than 10% this value is quite underrepresented so its distribution estimate is less stable. All in all the imputed values seem to be useful

full2$LotFrontage <- complete(imp)$LotFrontage
full2$BldgType <- complete(imp)$BldgType

5 Feature Importance

With many features is might be useful to have an idea about the importance of each feature in order to perform a feature selection.

5.1 Target Variable

Here, the correlation of each feature to the target variable is visualized.

train <- full2[label == "train", !"label"]

As suggested by the dataset description, the target variable is log-normal distributed.

summary(train$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  130000  163000  180900  214000  755000
ggplot(train, aes(x = SalePrice)) +
  geom_histogram(bins = 30) +
  scale_x_continuous(trans = "log10") +
  theme_bw()

Continuous Variables

The plots below visualize the correlatio between continuous variables and the target. The y axis is fixed to enable comparison between the variables. Visually X1stFlrSF and GrLivAreaseem to be important.

vars <- contVars
dt <- melt(train, "Id", vars)
dt$SalePrice <- rep(train$SalePrice, length(vars))
ggplot(dt, aes(y = SalePrice, x = value)) +
  geom_point(size = .5) +
  scale_y_continuous(trans = "log10") +
  facet_wrap(~ variable, scales = "free_x") +
  theme_bw()

Discrete Variables

Here YearBuilt, YearRemodAdd, GarageYrBlt, TotRmsAbvGrd, and FullBath seem important.

vars <- discVars
dt <- melt(train, "Id", vars)
dt$SalePrice <- rep(train$SalePrice, length(vars))
ggplot(dt, aes(y = SalePrice, x = as.factor(value))) +
  geom_boxplot() +
  scale_y_continuous(trans = "log10") +
  facet_wrap(~ variable, scales = "free_x") +
  theme_bw()

Ordinal Variables

Important features here are OverallQualt, ExterQual, BsmtQual, KitchenQual, basically the quality features. From these plots one could conclude that it does make sense to have these features numeric and not as a factor.

vars <- ordVars
dt <- melt(train, "Id", vars)
dt$SalePrice <- rep(train$SalePrice, length(vars))
ggplot(dt, aes(y = SalePrice, x = as.factor(value))) +
  geom_boxplot() +
  scale_y_continuous(trans = "log10") +
  facet_wrap(~ variable, scales = "free_x") +
  theme_bw()

Nominal Variables

There are not many particularly intersting features here. It seems \(MSZoning = C\) and \(CentralAir = Y\) is interesting though.

vars <- nomVars
dt <- melt(train, "Id", vars)
dt$SalePrice <- rep(train$SalePrice, length(vars))
ggplot(dt, aes(y = SalePrice, x = value)) +
  geom_boxplot() +
  scale_y_continuous(trans = "log10") +
  facet_wrap(~ variable, scales = "free_x") +
  theme_bw()

5.2 Buruta

This is a method which repeatedly fits random forest to the training set in order to estimate variable importance. Importance here are Z-scores, their significance is estimated by random permutations. Here, the package boruta is used according to Kursa and Rudnicki. The plot below shows confirmed, tentative and rejected variables in green, yellow, and red respectively.

boruta <- Boruta(train[, -"SalePrice"], train$SalePrice, ntree = 500)
plot(boruta)

boruta
## Boruta performed 99 iterations in 3.187884 mins.
##  48 attributes confirmed important: BedroomAbvGr, BldgType,
## BsmtCond, BsmtExposure, BsmtFinSF1 and 43 more;
##  22 attributes confirmed unimportant: BsmtFinSF2, BsmtHalfBath,
## Condition1, Condition2, Electrical and 17 more;
##  10 tentative attributes left: Alley, BsmtFinType2, EnclosedPorch,
## Fence, Functional and 5 more;

The table below summarizes the procedure. The variable with the highest median Z-score by far is GrLivArea, followed by OverallQual.

df <- attStats(boruta)
DT::datatable(df, options = list(dom = "ftp", pageLength = 20)) %>% 
  formatStyle('decision', 
    color = styleEqual(unique(df$decision), c('red', 'green', 'orange'))) %>% 
  formatRound(1:5, 2)

That’s it! The modeling will be done in another kernel. Thanks for reading through this whole thing.